from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from PIL import Image
import pytesseract # pip install pytesseract

# CAPTCHA recognition
def code_tesseract(img_checkcode, checkcode):
    """OCR the CAPTCHA image on the current page and type it into an input.

    Uses the module-level ``driver``. A full-page screenshot is saved, cropped
    to the bounding box reported for the CAPTCHA element, and the crop is fed
    to Tesseract. The stripped OCR result replaces the input's current content.

    Args:
        img_checkcode: ``id`` attribute of the CAPTCHA image element.
        checkcode: ``id`` attribute of the text input receiving the result.

    Side effects:
        Overwrites ``html.png`` (page screenshot) and ``code.png`` (cropped
        CAPTCHA) in the current working directory.
    """
    driver.save_screenshot('html.png')
    captcha = driver.find_element(By.ID, img_checkcode)
    location = captcha.location  # x/y coordinates of the CAPTCHA element
    size = captcha.size          # width/height of the CAPTCHA element
    # NOTE(review): the crop box assumes screenshot pixels map 1:1 to the
    # element coordinates (no display scaling) — confirm on the target machine.
    crop_box = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )
    # Context managers make sure the image files are closed after every
    # attempt; this function is called repeatedly while retrying CAPTCHAs.
    with Image.open('html.png') as page_shot:
        # Cut the CAPTCHA region out of the screenshot and save it as PNG.
        page_shot.crop(crop_box).save('code.png')
    with Image.open('code.png') as captcha_img:
        text = pytesseract.image_to_string(captcha_img).strip()
    code_input = driver.find_element(By.ID, checkcode)
    code_input.clear()
    code_input.send_keys(text)

# Collect the data for one grant category and one year
def get_data(grantCode, year):
    """Scrape every result page of the current search into a CSV file.

    Uses the module-level ``driver``, which must already show the search
    results. The page count is read from the ``sp_1_dataBar`` element; when it
    is empty nothing is written. Otherwise ``data_{grantCode}_{year}.csv`` is
    (re)created and, for each page, cells 1-7 of every row after the first row
    of the ``dataGrid`` table are written as one CSV record.

    Moving to the next page requires a CAPTCHA: it is solved with
    ``code_tesseract`` and retried for as long as the ``scmtip_content`` popup
    (which signals a rejected CAPTCHA) becomes visible.

    Args:
        grantCode: Grant category label; used in the output file name.
        year: Year label; used in the output file name.
    """
    import csv  # local import: keeps the function self-contained

    # Popup that becomes visible when the submitted CAPTCHA was wrong.
    popup_locator = (By.ID, 'scmtip_content')

    # Number of result pages
    dataBar = driver.find_element(By.ID, 'sp_1_dataBar').text
    page_sum = int(dataBar) if dataBar else 0
    if not page_sum:
        return

    # `with` guarantees the file is flushed and closed even if scraping fails
    # half-way. The default newline handling plus lineterminator='\n' keeps
    # the same line endings the previous hand-written '\n' produced.
    with open(f'data_{grantCode}_{year}.csv', 'w', encoding='utf-8') as f:
        # csv.writer quotes fields that contain commas or quotes, so such
        # values no longer shift the columns.
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['prjNo', 'subjectCode', 'ctitle', 'psnName',
                         'orgName', 'totalAmt', 'startEndDate'])
        for page in range(page_sum):
            table = driver.find_element(By.ID, 'dataGrid')
            table_rows = table.find_elements(By.TAG_NAME, 'tr')
            # Row 0 is skipped; data rows start at index 1.
            for table_row in table_rows[1:]:
                # Query the cells once per row instead of once per column.
                cells = table_row.find_elements(By.TAG_NAME, 'td')
                # Explicit indexing: a row with fewer than 8 cells still
                # raises IndexError instead of being written truncated.
                writer.writerow([cells[col].text for col in range(1, 8)])

            # Only advance when another page exists; the last page must not
            # solve a CAPTCHA and click "next" again.
            if page < page_sum - 1:
                # Solve the CAPTCHA
                code_tesseract("img_checkcode", "checkCode")
                # Click "next page"
                driver.find_element(By.ID, 'next_dataBar').click()
                try:
                    # While the error popup shows up, refresh the CAPTCHA
                    # image, OCR it again and retry the click.
                    while WebDriverWait(driver, 3).until(
                            EC.visibility_of_element_located(popup_locator)):
                        driver.find_element(By.ID, 'img_checkcode').click()
                        code_tesseract("img_checkcode", "checkCode")
                        driver.find_element(By.ID, 'next_dataBar').click()
                except TimeoutException:
                    # No popup within 3 seconds: the CAPTCHA was accepted.
                    pass
if __name__ == '__main__':
    # Script entry point: for every grant category (from option index 3 on)
    # and every year option, run the search, get past the CAPTCHA and dump
    # the results with get_data().
    # `driver` is deliberately a module-level name: code_tesseract() and
    # get_data() look it up as a global.
    url='https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list'
    # NOTE(review): `executable_path` is the Selenium 3 style of passing the
    # driver binary; newer Selenium releases expect a Service object — confirm
    # the installed version before upgrading.
    driver=webdriver.Firefox(executable_path='geckodriver.exe') # Firefox version: 80
    # Locator of the popup that appears when the CAPTCHA is rejected. It is
    # loop-invariant, so it is built once; both names stay at module level so
    # that functions looking them up as globals keep working.
    ele_locator = "scmtip_content"
    param = (By.ID,ele_locator)
    try:
        driver.get(url)
        # Grant category <select>
        sel_1 = driver.find_element(By.ID, "f_grantCode")
        # Year <select>
        sel_2 = driver.find_element(By.ID, "f_year")
        # NOTE(review): options 0-2 of the grant category list are skipped;
        # the reason is not visible here — confirm this is intended.
        for i in range(3,len(sel_1.find_elements(By.TAG_NAME, 'option'))):
            grantCode=sel_1.find_elements(By.TAG_NAME, 'option')[i].text
            print(grantCode)
            for j in range(len(sel_2.find_elements(By.TAG_NAME, 'option'))):
                Select(sel_1).select_by_index(i)
                year=sel_2.find_elements(By.TAG_NAME, 'option')[j].text
                print(year)
                Select(sel_2).select_by_index(j)
                # Solve the CAPTCHA
                code_tesseract("img_checkcode","f_checkcode")
                # Submit the search
                driver.find_element(By.ID, 'searchBt').click()
                try:
                    # If the popup becomes visible the CAPTCHA was wrong:
                    # refresh the image, OCR it again and resubmit.
                    while WebDriverWait(driver,3).until(EC.visibility_of_element_located(param)):
                        driver.find_element(By.ID, 'img_checkcode').click()
                        code_tesseract("img_checkcode", "f_checkcode")
                        driver.find_element(By.ID, 'searchBt').click()
                except TimeoutException:
                    # No popup within 3 seconds: the CAPTCHA was accepted.
                    pass
                # Collect the data
                get_data(grantCode,year)
                # Back to the search page; the old element references are
                # stale after the reload, so look both <select>s up again.
                driver.get(url)
                # Grant category <select>
                sel_1 = driver.find_element(By.ID, "f_grantCode")
                # Year <select>
                sel_2 = driver.find_element(By.ID, "f_year")
            print('*'*100)
    finally:
        # Always close the browser, even when scraping aborts with an error,
        # so no Firefox/geckodriver process is left behind.
        driver.quit()